# Reads in the DINS data from the geodatabase provided by CalFIRE in response to our public records request.
pacman::p_load(tidyverse, data.table, sf, fst, tidyr)

# Setup ----
source("code/globals.R")

# Load DINS data ----
dins <- st_read(file.path(INPUT_PUBLIC, "DamageData/DINS_2013_2020/All_Incidents_2013_2020_DINS_GDB.gdb")) %>%
  st_drop_geometry()

# Save original address, APN, lon/lat
dins <- dins %>%
  unite("address_orig", c(STREETNUMBER, STREETNAME, STREETTYPE, STREETSUFFIX, CITY, STATE), 
        remove = F, sep = " ", na.rm = T) %>%
  mutate(
    APN_orig = APN,
    address_orig = gsub("\\s+", " ", gsub("NA", "", address_orig)), # Replace NAs and multiple spaces
    lon_orig = Longitude,
    lat_orig = Latitude
  )


# Create FIPS crosswalk
FIPScrosswalk <- tribble(
  ~FIPS, ~cabbr, ~cname,
  6001, "", "Alameda",
  6005, "AMA", "Amador",
  6007, "BUT", "Butte",
  6009, "CAL", "Calaveras",
  6013, "CON", "Contra Costa",
  6017, "ELD", "El Dorado",
  6019, "FRE", "Fresno",
  6021, "", "Glenn",
  6023, "HUM", "Humboldt",
  6027, "", "Inyo",
  6029, "KER", "Kern",
  6031, "KIN", "Kings",
  6033, "LAK", "Lake",
  6035, "LAS", "Lassen",
  6037, "LOS", "Los Angeles",
  6039, "MAD", "Madera",
  6043, "MRP", "Mariposa",
  6045, "MEN", "Mendocino",
  6051, "", "Mono",
  6053, "", "Monterey",
  6055, "NAP", "Napa",
  6057, "NEV", "Nevada",
  6059, "ORA", "Orange",
  6061, "", "Placer",
  6063, "", "Plumas",
  6065, "RIV", "Riverside",
  6067, "", "Sacramento",
  6069, "", "San Benito",
  6071, "", "San Bernardino",
  6073, "SDG", "San Diego",
  6077, "", "San Joaquin",
  6079, "SLO", "San Luis Obispo",
  6081, "", "San Mateo",
  6083, "SBA", "Santa Barbara",
  6085, "", "Santa Clara",
  6087, "SCZ", "Santa Cruz",
  6089, "SHA", "Shasta",
  6093, "SIS", "Siskiyou",
  6095, "SOL", "Solano",
  6097, "SON", "Sonoma",
  6099, "", "Stanislaus",
  6103, "", "Tehama",
  6105, "TRI", "Trinity",
  6107, "", "Tulare",
  6109, "TUO", "Tuolumne",
  6111, "VEN", "Ventura",
  6113, "YOL", "Yolo",
  6115, "YUB", "Yuba") %>%
  mutate(FIPS = as.character(FIPS))

# Save so we don't end up making this again in other code
write_fst(FIPScrosswalk, file.path(WORKING, "county-crosswalk.fst"))

# Join to DINS on both cabbr and cname, since previous versions of DINS have had abbreviations
dins <- dins %>%
  left_join(FIPScrosswalk[, c("FIPS", "cabbr")], by = c("COUNTY" = "cabbr")) %>% # Previous versions of DINS have had abbreviations in county field in some obs. Not an issue in 2020 version of DINS.
  rename(FIPS.cabbr = FIPS) %>%
  left_join(FIPScrosswalk[, c("FIPS", "cname")], by = c("COUNTY" = "cname"))

# Drop county abbreviation
dins <- dins %>% select(-FIPS.cabbr)

# Subset based on structure category and structure type 

dins <- dins %>%
  filter(STRUCTURECATEGORY %in% c(
    "Mixed Commercial/Residential",
    "Multiple Residence",
    "Single Residence",
    "Single Residences"
  ))


dins <- dins %>%
  # remove MOTOR homes now
  filter(!STRUCTURETYPE %in% c("Motor Home")) %>%
  ## Tag MULTI FAMILY structure types for dropping later. There are about 1600 of these in the data.
  mutate(badstructuretype = STRUCTURETYPE %in% c("Multi Family Residence Multi Story", "Multi Family Residence Single Story", "Mixed Commercial/Residential")) %>%
  ## Categorize remaining structure types in SINGLE FAMILY and MOBILE HOME
  mutate(
    singlefam = STRUCTURETYPE %in% c("Single Family Residence Multi Story", "Single Family Residence Single Story", "Single Famliy Residence Single Story"),
    mobilehome = STRUCTURETYPE %in% c("Mobile Home Double Wide", "Mobile Home Single Wide", "Mobile Home Triple Wide"),
    check = badstructuretype + singlefam + mobilehome
  )

# Clean up key fields and start error checking
dins <- dins %>%
  mutate(
    APNoriginal = APN,
    APN = trimws(APN)
  ) ## About 150-ish APNs have no data, but are entered as a space (that is, " "). Trim white space gets rid of these and doesn't seem to cause other problems.

# confirm the only changed APNs are those with ONLY blank chars (" ")
checkwsdrop <- dins %>%
  filter(APN != APNoriginal) %>%
  select(APN, APNoriginal)

## Replace NA and "" APNs with a code (avoids accidental issues with missing values later)
dins <- dins %>%
  mutate(APN = if_else(is.na(APN), "missinginDINS", APN)) %>%
  mutate(APN = if_else(APN == "", "missinginDINS", APN))


# Deal with multiple records on individual APNs within a county 

# Calculate reported copies of each structure type for each APN-incident, make flags for presence of various structure types on an APN-incident group
f <- dins %>%
  group_by(FIPS, APN, INCIDENTNUM) %>%
  summarize(
    numsinglefam = sum(singlefam),
    nummobilehome = sum(mobilehome),
    numbadstructuretype = sum(badstructuretype)
  ) %>%
  mutate(
    anybad = numbadstructuretype > 0,
    anysinglefam = numsinglefam > 0,
    anymobilehome = nummobilehome > 0
  )

dins <- dins %>%
  left_join(f, by = c("FIPS", "APN", "INCIDENTNUM"))

# Handle multiple records with same APN ----
## Rules: 
# If there are any MULTI FAMILY residence structure types, mark all records for that APN with error flag.
# If there are any MIXED COMMERCIAL/RESIDENTIAL structure types, mark all records for that APN with error flag.
# If none of the above structure types, but there are SINGLE FAMILY RESIDENCE structure types, assign that APN the max damage among SINGLE FAMILY structure types on that APN (ignoring any MOBILE HOME structure types)
# If no error flag and no SINGLE FAMILY structure types on the APN, use max damage among MOBILE HOME structure types on the APN.
# This yields three classes of observations that partition the data and I process differently

dins <- dins %>%
  mutate(APNgroup = case_when(
    anybad == TRUE ~ "bad",
    anybad == FALSE & anysinglefam == TRUE ~ "hassinglefam",
    anybad == FALSE & anysinglefam == FALSE ~ "onlymobilehomes"
  ))

# Turn damage character vector into an ordered factor
dins <- dins %>%
  mutate(DAMAGE = factor(DAMAGE,
    levels = c(
      "Inaccessible",
      "No Damage",
      "Affected (1-9%)",
      "Minor (10-25%)",
      "Major (26-50%)",
      "Destroyed (>50%)"
    ), ordered = T
  ))

### Idenfity the MAX damage among RELEVANT structures in each APN group
badAPNs <- dins %>%
  filter(APNgroup == "bad") %>%
  group_by(FIPS, APN, INCIDENTNUM) %>%
  summarize(maxdamage = max(DAMAGE))

withsf <- dins %>%
  filter(APNgroup == "hassinglefam") %>%
  filter(singlefam == TRUE) %>% # Omit mobile home damage reports for these
  group_by(FIPS, APN, INCIDENTNUM) %>%
  summarize(maxdamage = max(DAMAGE))

withoutsf <- dins %>%
  filter(APNgroup == "onlymobilehomes") %>%
  group_by(FIPS, APN, INCIDENTNUM) %>%
  summarize(maxdamage = max(DAMAGE))

damage <- rbind(badAPNs, withsf, withoutsf)

## Keep records with damage matching max damage
dins <- dins %>%
  left_join(damage, by = c("FIPS", "APN", "INCIDENTNUM")) %>%
  filter(DAMAGE == maxdamage) %>%
  ## Drop non single family (mobile homes) on lots with single family homes
  filter(singlefam == TRUE | anysinglefam == FALSE)

# Among remaining duplicate records, select one for each APN
set.seed(26469)
dins <- dins %>%
  group_by(FIPS, APN, INCIDENTNUM) %>%
  slice_sample(n = 1) %>% # randomly chooses 1 row from each group of duplicates
  ungroup()

# Label APNs to NOT USE in analysis due to bad structure types (I don't drop here because I want to exclude them from control group also (i.e., Zillow))
dins <- dins %>%
  mutate(
    howmanystructures = numsinglefam + nummobilehome + numbadstructuretype,
    badrecordflag = anybad == TRUE | howmanystructures > 1
  ) ## The "1" filter for howmanystructures here seems very conservative.

# Final cleaning ------

# Drop if APN is missing
dins <- dins %>%
  filter(APN != "missinginDINS")

# Keep vars of interest
varsofinterest <- c(
  "FIPS", "APN", "DAMAGE", "STREETNUMBER", "STREETNAME", "STREETTYPE", "STREETSUFFIX", "CITY", "STATE", "CALFIREUNIT", "COMMUNITY",
  "INCIDENTNAME", "INCIDENTNUM", "INCIDENTSTARTDATE", "WHEREFIRESTARTEDONSTRUCTURE", "WHATDIDFIRESTARTFROM",
  "PROPANETANKDISTANCE", "STRUCTURETYPE", "NUMBEROFUNITPERSTRUCTURE", "ROOFCONSTRUCTION",
  "EAVES", "VENTSCREEN", "EXTERIORSIDING", "WINDOWPANE", "DECKPORCHONGRADE", "DECKPORCHELEVATED", "PATIOCOVERCARPORT",
  "FENCEATTACHEDTOSTRUCTURE", "UTILITYMISCSTRUCTUREDISTANCE", "badrecordflag", "anysinglefam", "anymobilehome", "badstructuretype", "howmanystructures",
  "DEFENSIVEACTIONS", "APN_orig", "address_orig", "lon_orig", "lat_orig"
)
dins <- dins %>%
  select(all_of(varsofinterest)) %>%
  # Create UnformattedAssessorParcelNumber var for merge to ZTRAX
  mutate(UnformattedAssessorParcelNumber = APN) %>%
  # Numeric FIPS for consistency with other data sources
  mutate(FIPS = as.numeric(FIPS)) %>%
  mutate(in_damage_data = 1) # Indicator for if this observation comes from the damage data

# Reset DAMAGE to character (from factor) since it comes in as a character variable in other non-DINS damage sources
dins <- dins %>% mutate(DAMAGE = as.character(DAMAGE))

# Make a date field and clean up a couple problem incidents with inconsistent dates
dins <- dins %>%
  mutate(date = as.Date(INCIDENTSTARTDATE)) %>%
  mutate(date = if_else(INCIDENTNUM == "CALNU 010104", as.Date("2017-10-08"), date))

# Clean up inconsistent incident names
dins <- dins %>%
  mutate(INCIDENTNAME = case_when(
    INCIDENTNUM == "CAMEU 008674" ~ "Ranch",
    INCIDENTNUM == "CALNU 010104" ~ "Tubbs Fire",
    INCIDENTNUM == "CANEU 026295" ~ "Laporte",
    TRUE ~ INCIDENTNAME
  ))

# Make a list of FIPS to know what to import from ZTRAX
saveRDS(unique(dins$FIPS), file.path(WORKING, "counties_in_dins_data.RDS"))

# Counties with incidents before Sept 2016 -- these are the counties I need to import data for from historical assessment files
saveRDS(unique(dins$FIPS[dins$date < as.Date("2016-09-01")]), file.path(WORKING, "counties_before_sept_2016.Rds"))

# Finally, save full dataset
saveRDS(dins, file.path(WORKING, "dins_records.Rds"))
